import os
import sys
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from dotenv import load_dotenv
from matplotlib import pyplot as plt
from matplotlib.cm import ScalarMappable
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP
sys.path.append("../")
data_path = "../data"
from models.cx_groups import Cx_groups
from models.pca_clients import Cx_groups_post_pca
from models.easy_pca import Easy_pca
from scripts.optimizers_mp import k_means_optimizer
from scripts.df_actions import remove_outliers
# Global notebook rendering setup
pio.renderers.default = "notebook_connected"
load_dotenv()
# set_palette() actually applies the palette; color_palette() alone only
# *returns* it without changing seaborn's state, so the original call was a no-op.
sns.set_palette('colorblind')
plt.style.use('Solarize_Light2')
# Setting default DPI, pulling it from dotenv if it exists, falling back to 100
try:
    pc_dpi = int(os.getenv('DPI'))
except (TypeError, ValueError):
    # TypeError: DPI env var is unset (getenv returned None)
    # ValueError: DPI is set but not an integer literal (was uncaught before)
    # Note: int() can never return None, so the old `if pc_dpi is None` check was dead code.
    pc_dpi = 100
Taking into account what the RFM approach taught us and the clusters we extracted from the data:
# Load the feature-engineered customer dataset produced upstream
olist_clients_dataset = "../final_datasets/olist_customers.pkl"
df_cx = pd.read_pickle(olist_clients_dataset)
df_cx.describe()  # summary statistics for every numeric/timedelta column
| customer_uid | recency | frequency | monetary | num_orders | cluster_kmeans_4 | cluster_DBSCAN | lat | lon | rating_avg | rating_ratio | comment_ratio | delta_delivery | expected_reality | distance_cx_seller | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 95922.000000 | 9.592200e+04 | 95922.000000 | 95922.000000 | 95922.000000 | 95922.000000 | 95922.000000 | 95653.000000 | 95653.000000 | 95211.000000 | 95922.000000 | 95922.000000 | 90212 | 90212 | 94683.000000 |
| mean | 48050.027408 | 2.488008e+07 | 0.144087 | 164.790904 | 1.034872 | 1.581326 | 0.040502 | -21.186477 | -46.175050 | 4.085653 | 0.992588 | 0.410418 | 12 days 10:59:52.732275085 | -12 days +02:32:54.229592516 | 602.671260 |
| std | 27740.300523 | 1.323897e+07 | 0.106734 | 227.880166 | 0.214573 | 1.210225 | 3.080952 | 5.628223 | 4.066423 | 1.341295 | 0.085775 | 0.490193 | 8 days 19:34:30.557032786 | 9 days 08:37:12.484771075 | 596.799614 |
| min | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -36.605374 | -72.666706 | 1.000000 | 0.000000 | 0.000000 | 0 days 12:48:07 | -51 days +00:00:00 | 0.000000 |
| 25% | 24028.250000 | 1.415290e+07 | 0.071299 | 62.370000 | 1.000000 | 0.000000 | 0.000000 | -23.588447 | -48.110368 | 4.000000 | 1.000000 | 0.000000 | 6 days 18:15:44.750000 | -17 days +00:00:00 | 189.232300 |
| 50% | 48048.500000 | 2.322363e+07 | 0.105751 | 107.220000 | 1.000000 | 2.000000 | 0.000000 | -22.926003 | -46.631276 | 5.000000 | 1.000000 | 0.000000 | 10 days 04:58:03 | -12 days +00:00:00 | 434.458400 |
| 75% | 72077.750000 | 3.431603e+07 | 0.172824 | 182.100000 | 1.000000 | 3.000000 | 0.000000 | -20.134916 | -43.605058 | 5.000000 | 1.000000 | 1.000000 | 15 days 17:00:39.250000 | -7 days +00:00:00 | 799.043250 |
| max | 96095.000000 | 6.677370e+07 | 1.701609 | 13664.080000 | 17.000000 | 3.000000 | 255.000000 | 42.184003 | -8.577855 | 5.000000 | 1.000000 | 1.000000 | 103 days 21:34:37 | 55 days 00:00:00 | 8736.947600 |
df_cx.head()
| customer_uid | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | cluster_kmeans_4 | cluster_DBSCAN | k_cluster_name | ... | order_id_list | rating_avg | rating_ratio | comment_ratio | has_rated | has_commented | delta_delivery | expected_reality | early_on_time | distance_cx_seller | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 | 3 | 0 | early_majority | ... | [88493] | 4.0 | 1.0 | 0.0 | True | False | 8 days 19:30:00 | -11 days | True | 346.9776 |
| 1 | 2 | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 | 3 | 0 | early_majority | ... | [90419] | 5.0 | 1.0 | 0.0 | True | False | 16 days 15:52:55 | -8 days | True | 413.9531 |
| 2 | 3 | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 | 3 | 0 | early_majority | ... | [22558] | 5.0 | 1.0 | 0.0 | True | False | 26 days 01:51:06 | 1 days | False | 29.5741 |
| 3 | 4 | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.30 | 1 | 1 | 0 | investors | ... | [32181] | 5.0 | 1.0 | 0.0 | True | False | 14 days 23:57:47 | -13 days | True | 19.3538 |
| 4 | 5 | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 | 0 | 0 | late_majority | ... | [69903] | 5.0 | 1.0 | 1.0 | True | True | 11 days 11:04:18 | -6 days | True | 219.7266 |
5 rows × 22 columns
df_cx.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 95922 entries, 0 to 95921 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_uid 95922 non-null uint32 1 most_ancient_order_dt 95922 non-null datetime64[ns] 2 most_recent_order_dt 95922 non-null datetime64[ns] 3 recency 95922 non-null float64 4 frequency 95922 non-null float64 5 monetary 95922 non-null float64 6 num_orders 95922 non-null uint8 7 cluster_kmeans_4 95922 non-null uint8 8 cluster_DBSCAN 95922 non-null uint8 9 k_cluster_name 95922 non-null object 10 lat 95653 non-null float64 11 lon 95653 non-null float64 12 order_id_list 95922 non-null object 13 rating_avg 95211 non-null float64 14 rating_ratio 95922 non-null float64 15 comment_ratio 95922 non-null float64 16 has_rated 95922 non-null bool 17 has_commented 95922 non-null bool 18 delta_delivery 90212 non-null timedelta64[ns] 19 expected_reality 90212 non-null timedelta64[ns] 20 early_on_time 95922 non-null bool 21 distance_cx_seller 94683 non-null float64 dtypes: bool(3), datetime64[ns](2), float64(9), object(2), timedelta64[ns](2), uint32(1), uint8(3) memory usage: 11.9+ MB
Goals :
This will help to visualize how many people have left at least one review, and whether they commented it. We will see the involvement of customers, first across the whole dataset. We will calculate the average of all ratings to provide a reference/comparison point for ratings per cluster.
Method :
We will first assess the amount of customers that have rated and/or commented. We will calculate the average ratio for ratings and comments. It will be interesting to see, first, those metrics on the dataset, then applied to our first successful segmentation (k-means w/ k=4)
# Checking if people can comment without rating :
commenters = df_cx[df_cx["has_commented"] == True]
# BUG FIX: the original compared rating_avg == np.nan, which is ALWAYS False
# (NaN never compares equal to anything, including itself), so the length was
# 0 regardless of the data. isna() performs the intended missing-value check;
# on this dataset it still prints 0, but now for the right reason.
print(f"{commenters['rating_avg'].isna().sum()}")
del commenters  # Flush
0
def addlabels(x, y):
    """Write each bar's value above it on the current axes.

    Labels are placed at integer x positions 0..len(y)-1 (the original
    iterated range(len(x)) but only ever used the index, so enumerate(y)
    is the idiomatic equivalent).
    """
    for i, value in enumerate(y):
        plt.text(i, value, f"{value}", ha="center", bbox=dict(facecolor="white", alpha=1))
# No they cant, okay so cx who have posted a comment have to rate the order.
# Dataset-wide reference values: average rating (ignoring missing ones),
# average rating/comment ratios, and population counts per review behaviour.
ratings_avg_dswide = np.average(df_cx["rating_avg"].dropna().values.tolist())
ratings_ratio_dswide = np.average(df_cx["rating_ratio"])
comment_ratio_dswide = np.average(df_cx["comment_ratio"])
rating_poster_dswide = len(df_cx[df_cx["has_rated"]])
comment_poster_dswide = len(df_cx[df_cx["has_commented"]])
no_rating = len(df_cx[~df_cx["has_rated"]])
# Bar chart of the three review behaviours: rated, rated+commented, no review at all
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(10, 6),
dpi=pc_dpi,
)
bars = {"rating_poster": rating_poster_dswide, "comment_and_rate": comment_poster_dswide, "no_review": no_rating}
cmap_one = ["#000331", "navy", "red"]
ax1.bar(x=list(bars.keys()), height=list(bars.values()), color=cmap_one)
###
# Titles/Lables
# addlabels writes each group's population on top of its bar
addlabels(x=[cat for cat in bars.keys()], y=[pop for pop in bars.values()])
ax1.set_ylabel("Population of group")
ax1.set_xlabel("Group")
ax1.tick_params(left=False)
fig.suptitle("Amount of users by their method of rating")
#
###
fig.tight_layout()
plt.show()
By design (Olist sharing mostly orders which have been subject to customer review) or by accident (very small non zero chance), the crushing majority of customers have rated their orders at least once (95211 out of 95922, or about 99.26% of customers).
There is also a considerable amount of customers who have assorted their reviews with a comment (You need to rate the order to comment it, at least, in this dataset, no commented order was unrated) : 39700 out of 95922 clients, which is about 41.39% of the total population.
Finally, a crushing minority of clients (711, which represents 0.74% of the dataset) have neither rated nor commented any of their orders. Due to the nature of the data, we can theorize that this concerns orders that have not been yet completed (delivered ?) at the time of SQL dump. We will include this in our report and inquire about those 711 customers.
We need to visualize at which ratio customers have reviewed and commented on average, but since most (but not all) customers have only placed one order, we can expect this number to be quite close to 1, for ratings at least.
# Same layout as the previous chart, now showing average ratios instead of raw counts
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(10, 6),
dpi=pc_dpi,
)
bars_ratios_dswide = {"review_ratio": ratings_ratio_dswide, "comment_ratio": comment_ratio_dswide}
cmap_one = ["#000331", "navy"]
ax1.bar(x=list(bars_ratios_dswide.keys()), height=list(bars_ratios_dswide.values()), color=cmap_one)
###
# Titles/Lables
addlabels(x=[cat for cat in bars_ratios_dswide.keys()], y=[pop for pop in bars_ratios_dswide.values()])
ax1.set_ylabel("Ratio (0-1)")
ax1.tick_params(left=False)
fig.suptitle("Average rating and comment ratio, datasetwide")
#
###
fig.tight_layout()
plt.show()
That was expected, this is in line with what we saw on the previous chart. Average ratio of ratings is close to 1 and comment ratio close to 0.4139 (our 41.39% from the comment group earlier).
We can conclude this analysis of the review ratio by stating that, being that close to 1 and seemingly almost mandatory for the order to be in the dataset, it is of no use since it cannot efficiently help our clustering. We later drop this variable (keeping the boolean for now), but we suggest its use in the model chosen at the end of this analysis. According to this article, in 2019, 47% of e-commerce customers left a review each month. It would be necessary to see if that proportion is similar in the whole Olist database or if it stays that high.
Let's take a look at the rating distribution in the dataset and the average rating left by customers.
## Rounding up to 1/1.5/2/2.5/3/3.5/4/4.5/5
# round(r * 2) / 2 snaps each average rating to the nearest half-point.
rounded_list = [round(rating * 2) / 2 for rating in df_cx["rating_avg"].dropna()]
# sorted(set(...)) replaces the manual "collect uniques then sort" loop
# (the original comment even admitted it was reinventing the wheel).
uniques = sorted(set(rounded_list))
print(uniques)
# Nice, feels like im reinventing the wheels there but that works
[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
# Counter tallies every rounded rating in a single O(n) pass; the original
# called list.count once per unique value (O(n * k)). Keys keep the sorted
# order of `uniques` so downstream list(ratings_dswide.values()) is unchanged.
counts = Counter(rounded_list)
ratings_dswide = {rating: counts[rating] for rating in uniques}
# Bar chart of the half-point rating buckets computed above
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(10, 6),
dpi=pc_dpi,
)
# One color per half-point bucket, dark -> blue as ratings improve
colors = ["black", "dimgray", "darkred", "red", "yellow", "steelblue", "royalblue", "navy", "#000331"]
ratings = [str(number) for number in np.arange(1, 5.5, 0.5)]
# (removed dead `amount = rating` — leftover reference to the counting loop's
# variable; it was never used)
ax1.bar(x=ratings, height=list(ratings_dswide.values()), color=colors)
###
# Titles/Lables
addlabels(x=[rating for rating in ratings_dswide.keys()], y=[pop for pop in ratings_dswide.values()])
#
###
fig.tight_layout()
plt.show()
Looks like there is an overwhelming number of very high ratings (5 = half of the customers). This looks overly optimistic but it is not unlikely as the study quoted earlier states that satisfied customers are more likely to give a review. Negative ones too but less so, according to this study (point 25).
So we can expect the average overall rating to be quite high. Let's check.
print(f"Average for the whole dataset = {ratings_avg_dswide}")
Average for the whole dataset = 4.085652664261414
Ok so we have our reference value, now we can use it as a baseline to determine clusters characteristics
Goals :
We might be able to distinguish different characteristics from cluster to cluster, hence giving us a way to compare and evaluate the relevance of a given variable. (Which could otherwise be done using SHAP)
Method :
For starters, Let's use names of clusters and not their numbers. It will make more sense to talk about people and not numbers.
We will use a radar plot to represent the most important statistics and their average per group :
## Min Max Scaler doesnt support timedeltas, so we need to convert timedeltas to ints (days)
def get_delta_days(row):
    """Return the row's expected_reality timedelta as a whole number of days (NaN when missing)."""
    delta = row["expected_reality"]
    try:
        return int(delta.days)
    except ValueError:
        # int(nan) raises ValueError when the timedelta is NaT
        return np.nan
df_cx["delta_days"] = df_cx.apply(get_delta_days, axis=1)
# Scale the radar-plot features onto a common 0-5 range so they share one axis.
mms = MinMaxScaler(feature_range=(0, 5))
subset = ["rating_avg", "delta_days", "distance_cx_seller", "recency", "frequency"]
# Derive the scaled column names and kept columns from `subset` instead of
# spelling the same lists out three times.
scaled_subset = [f"scaled_{col}" for col in subset]
keepcols = subset + ["k_cluster_name"]
df_cx_mms = df_cx[keepcols].copy()
df_cx_mms[scaled_subset] = mms.fit_transform(df_cx_mms[subset].to_numpy())
df_cx_mms.head()
| rating_avg | delta_days | distance_cx_seller | recency | frequency | k_cluster_name | scaled_rating_avg | scaled_delta_days | scaled_distance_cx_seller | scaled_recency | scaled_frequency | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -11.0 | 346.9776 | 44850283.0 | 0.053939 | early_majority | 3.75 | 1.886792 | 0.198569 | 3.358379 | 0.158495 |
| 1 | 5.0 | -8.0 | 413.9531 | 24007314.0 | 0.100769 | early_majority | 5.00 | 2.028302 | 0.236898 | 1.797662 | 0.296100 |
| 2 | 5.0 | 1.0 | 29.5741 | 13051353.0 | 0.185360 | early_majority | 5.00 | 2.452830 | 0.016925 | 0.977282 | 0.544661 |
| 3 | 5.0 | -13.0 | 19.3538 | 18840220.0 | 0.128406 | investors | 5.00 | 1.792453 | 0.011076 | 1.410752 | 0.377308 |
| 4 | 5.0 | -6.0 | 219.7266 | 6939528.0 | 0.348612 | late_majority | 5.00 | 2.122642 | 0.125746 | 0.519630 | 1.024359 |
# One Cx_groups wrapper per k-means cluster, each fed the rows of df_cx_mms
# belonging to that cluster.
investor_group = Cx_groups(cluster_name="investors", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "investors"])
early_m_group = Cx_groups(cluster_name="early_majority", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "early_majority"])
late_m_group = Cx_groups(cluster_name="late_majority", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "late_majority"])
lagg_group = Cx_groups(cluster_name="laggards", dataframe=df_cx_mms[df_cx_mms["k_cluster_name"] == "laggards"])
# store_radar() builds each group's radar trace (the class keeps it on .trace)
radar_inv = investor_group.store_radar()
em_radar = early_m_group.store_radar()
lm_radar = late_m_group.store_radar()
lagg_radar = lagg_group.store_radar()
# Superpose the four radar traces stored by the Cx_groups instances.
fig = go.Figure()
for group in (investor_group, early_m_group, late_m_group, lagg_group):
    fig.add_trace(go.Scatterpolar(group.trace))
colors = ["#000331", "navy", "royalblue", "red"]
title = "Newly created stats, MinMaxed for each cluster determined during RFM segmentation Each cluster"
fig.update_layout(
    title=title,
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 5],
        )),
    showlegend=True,
)
fig.show()
investor_group.get_standard_stats()  # compute the group's raw (unscaled) averages
investor_group.standard_stats  # display the resulting Group_stats record
Group_stats(rating_avg=4.0801387953803925, delta_days=-11.95422070784788, distance_cx_seller=603.6218243071827, recency=24705325.87966058, frequency=0.14536305281344908)
lagg_group.get_standard_stats()  # same raw stats for the laggards, for comparison
lagg_group.standard_stats
Group_stats(rating_avg=4.079201838304257, delta_days=-11.880257357321577, distance_cx_seller=599.0734830752654, recency=24707661.49371684, frequency=0.14487645549994077)
df_cx.describe()
| customer_uid | recency | frequency | monetary | num_orders | cluster_kmeans_4 | cluster_DBSCAN | lat | lon | rating_avg | rating_ratio | comment_ratio | delta_delivery | expected_reality | distance_cx_seller | delta_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 95922.000000 | 9.592200e+04 | 95922.000000 | 95922.000000 | 95922.000000 | 95922.000000 | 95922.000000 | 95653.000000 | 95653.000000 | 95211.000000 | 95922.000000 | 95922.000000 | 90212 | 90212 | 94683.000000 | 90212.000000 |
| mean | 48050.027408 | 2.488008e+07 | 0.144087 | 164.790904 | 1.034872 | 1.581326 | 0.040502 | -21.186477 | -46.175050 | 4.085653 | 0.992588 | 0.410418 | 12 days 10:59:52.732275085 | -12 days +02:32:54.229592516 | 602.671260 | -11.893817 |
| std | 27740.300523 | 1.323897e+07 | 0.106734 | 227.880166 | 0.214573 | 1.210225 | 3.080952 | 5.628223 | 4.066423 | 1.341295 | 0.085775 | 0.490193 | 8 days 19:34:30.557032786 | 9 days 08:37:12.484771075 | 596.799614 | 9.359172 |
| min | 1.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | -36.605374 | -72.666706 | 1.000000 | 0.000000 | 0.000000 | 0 days 12:48:07 | -51 days +00:00:00 | 0.000000 | -51.000000 |
| 25% | 24028.250000 | 1.415290e+07 | 0.071299 | 62.370000 | 1.000000 | 0.000000 | 0.000000 | -23.588447 | -48.110368 | 4.000000 | 1.000000 | 0.000000 | 6 days 18:15:44.750000 | -17 days +00:00:00 | 189.232300 | -17.000000 |
| 50% | 48048.500000 | 2.322363e+07 | 0.105751 | 107.220000 | 1.000000 | 2.000000 | 0.000000 | -22.926003 | -46.631276 | 5.000000 | 1.000000 | 0.000000 | 10 days 04:58:03 | -12 days +00:00:00 | 434.458400 | -12.000000 |
| 75% | 72077.750000 | 3.431603e+07 | 0.172824 | 182.100000 | 1.000000 | 3.000000 | 0.000000 | -20.134916 | -43.605058 | 5.000000 | 1.000000 | 1.000000 | 15 days 17:00:39.250000 | -7 days +00:00:00 | 799.043250 | -7.000000 |
| max | 96095.000000 | 6.677370e+07 | 1.701609 | 13664.080000 | 17.000000 | 3.000000 | 255.000000 | 42.184003 | -8.577855 | 5.000000 | 1.000000 | 1.000000 | 103 days 21:34:37 | 55 days 00:00:00 | 8736.947600 | 55.000000 |
As we can see here and in the plots above, the ratings are unexpectedly high and the standard deviation very low. On our sample, customers are overall satisfied/very satisfied. This behavior — a very high mean and a very low standard deviation, even between the groups — is not beneficial for any clustering model: if there are no noticeable differences between the clients, it will prove difficult to propose a comprehensive clustering method to help Olist with customer segmentation. If the gain extracted from this new variable does not improve the model but on the contrary flaws it, we will recommend the simple RFM approach.
df_cx["rating_avg"].describe()
count 95211.000000 mean 4.085653 std 1.341295 min 1.000000 25% 4.000000 50% 5.000000 75% 5.000000 max 5.000000 Name: rating_avg, dtype: float64
df_cx["rating_avg"].quantile([0.1, 0.25, 0.50, 0.75, 0.90])
0.10 1.0 0.25 4.0 0.50 5.0 0.75 5.0 0.90 5.0 Name: rating_avg, dtype: float64
# Horizontal boxplot of the average ratings
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 2),
dpi=pc_dpi,
)
g = sns.boxplot(x="rating_avg", data=df_cx, ax=ax1)  # NOTE(review): `g` is unused; the assignment could be dropped
###
# Titles/Lables
fig.suptitle("Rating repartition across the dataset")
#
###
fig.tight_layout()
plt.show()
Ok so more than 75% of the customer sample did not rate, in average, under 4. If there is something better to use as a satisfaction indicator, we'll take it. Meanwhile, let's investigate on this sub-sample of customers : the ones that have given a 2.5 rating or worse in average.
# Sub-sample: customers whose average rating is 2.5 or lower
df_cx_sample = df_cx[df_cx["rating_avg"] <= 2.5]
print(f"{len(df_cx_sample)} have rated under 2.5")
13909 have rated under 2.5
df_cx_sample["k_cluster_name"].value_counts()
early_majority 4624 late_majority 3825 investors 2832 laggards 2628 Name: k_cluster_name, dtype: int64
Ok so there's a lot of every one based from our previous clustering. Most important groups are the two majorities (which represent both around 32/35%), which was to be expected. What is interesting is to see that our "laggards" from the previous segmentation (which represented 18% of the dataset) are more or less as important here as the group we called "investors" in the first clustering. While they both represent around 20%, we have almost an equal amount of both. Which is interesting as we did not expect the "investor" group to give low ratings, on the contrary. This is a good indication that we lacked information in our RFM clustering and what we obtained during the feature engineering will likely improve the initial segmentation. In the rest of the analysis, we will also compare this group specifically and maybe diagnose the nature of this "low" rating.
While we have two timedelta variables for this indicator, there is one which will likely not result in an improvement for the clustering.
Indeed, the raw value of ∆Order/Delivery might not be as useful as expected, and ∆expected/delivery would be much more precise.
Let's say I pre-order something (P1) on the internet that's supposed to come out in 1 month, and I order another thing (P2) that I expect in a week. If every thing goes according to what has been indicated, I will receive P1 in a month and P2 in a week, those two orders won't have the same ∆order/delivery and that will impact negatively P1 (∆ = 1 month) over P2 (∆ = 1 week).
Now the ∆Expected/Delivered takes into account the initial expectation. So in the case all is fine and on time, ∆ will be 0 for both P1 and P2, a negative ∆ will indicate that it was delivered early and a positive one, late. This is likely way more precise.
We will use ∆expected/delivery from now on. Let's see if this has an impact on Cx satisfaction.
# Mean rating per whole-day delivery delta. groupby computes this in a single
# pass, whereas the original filtered the full dataframe once per unique delta
# value (O(unique_values * len(df))). NaN delta_days are dropped by groupby
# and NaN ratings are skipped by mean(), matching the original notna filters.
# (Key order becomes sorted rather than order-of-appearance; the scatter plot
# below is unaffected since each key keeps its value.)
x_height = df_cx.groupby("delta_days")["rating_avg"].mean().to_dict()
# Scatter of mean rating vs (expected - actual) delivery gap in days
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 4),
dpi=pc_dpi,
)
ax1.scatter(x=list(x_height.keys()), y=list(x_height.values()))
###
# Titles/Lables
ax1.set_xlabel("Delta Days between expected and actual delivery")
ax1.set_ylabel("Average Rating for a given Delta Day")
fig.suptitle("Average rating by Days elapsed between expected and actual delivery")
#
###
fig.tight_layout()
plt.show()
As expected, clients' ratings are related to the punctuality of Olist. If the order arrives early or on time, clients' ratings are high, and this trend inverses as delta=0 is reached. People are more dissatisfied for every day Olist is late. The +30 days relatively high average bump in rating can probably be attributed to the lack of orders delivered beyond 30 days. Let's check if our dataset containing our relatively dissatisfied customers contains more clients that have been delivered late.
# Share of early/on-time deliveries: whole dataset vs the low-raters sample
amount_eot_all = len(df_cx[df_cx["early_on_time"] == True])
amount_eot_sample = len(df_cx_sample[df_cx_sample["early_on_time"] == True])
percentage_all = (amount_eot_all / len(df_cx)) * 100
percentage_sample = (amount_eot_sample / len(df_cx_sample)) * 100
print(f"There is {percentage_all}% of who have been delivered on time on the whole dataset")
print(f"There is {percentage_sample}% of who have been delivered\
on time amongst the clients who have rated 2.5 or less (avg)")
There is 90.80607159984154% of who have been delivered on time on the whole dataset There is 58.15658925875332% of who have been delivered on time amongst the clients who have rated 2.5 or less (avg)
There is indeed a great difference between the sample of dissatisfied customers and the whole dataset. 90.8% of the clients have received their orders early or on time on the whole dataset and just 58.2% of the clients for the sample containing people who have left a rating of 2.5 or less (avg.). We can safely say that a big part of the dissatisfied customers were disappointed because they were not delivered on time.
# Missing values and spread of the customer<->seller distance
print(f"There is {df_cx['distance_cx_seller'].isna().sum()} NA values in the dataset")
print(f"The Maximum distance is {df_cx['distance_cx_seller'].max()} Kms between seller and cx")
df_cx["distance_cx_seller"].describe()
There is 1239 NA values in the dataset The Maximum distance is 8736.9476 Kms between seller and cx
count 94683.000000 mean 602.671260 std 596.799614 min 0.000000 25% 189.232300 50% 434.458400 75% 799.043250 max 8736.947600 Name: distance_cx_seller, dtype: float64
It seems we have a lot of NA (which we will fill with the median, here 434.45 and change). Let's also plot the distribution to get a better sense of the extremes, which we will include as "more/less than whisker" to avoid over representation of the extremes
# Boxplot of customer<->seller distances (NaN rows excluded)
distances = df_cx[df_cx["distance_cx_seller"].notna()]["distance_cx_seller"].values.tolist()
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 4),
dpi=pc_dpi,
)
ax1.boxplot(x=distances, vert=False, showbox=True, showmeans=True, widths=(0.4))
###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed")
ax1.set_yticks([])
#
###
fig.tight_layout()
plt.show()
Looks like most people order between 0 and 2k Kms, lets zoom on this area
# Same boxplot, x-axis clipped to the 0-2000 km range where most customers sit
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 4),
dpi=pc_dpi,
)
ax1.boxplot(x=distances, vert=False, showbox=True, showmeans=True, widths=(0.4))
###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed\
\nzoom on less than 2000")
ax1.set_xlim(left=(-100), right=(2000))
ax1.set_xticks(range(0, 2000, 200))
ax1.set_yticks([])
#
###
fig.tight_layout()
plt.show()
Let's fillna with the median, get 10/90% values and take a peek
# Impute missing distances with the median (robust to the long right tail)
df_cx["distance_cx_seller"] = df_cx["distance_cx_seller"].fillna(df_cx["distance_cx_seller"].median())
distances_post_fill = df_cx[df_cx["distance_cx_seller"].notna()]["distance_cx_seller"].values.tolist()
quantiles_distance = df_cx["distance_cx_seller"].quantile([0.10, 0.90])
llim = round(quantiles_distance[0.10], ndigits=4) # Rounded to meter
rlim = round(quantiles_distance[0.90], ndigits=4) # Rounded to meter
print(f"10% under {llim} Kms, 90% under {rlim} Kms")
10% under 36.4582 Kms, 90% under 1455.8878 Kms
# Zoomed boxplot after median imputation; sym="" hides the flier points
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 4),
dpi=pc_dpi,
)
ax1.boxplot(x=distances_post_fill, vert=False, showbox=True, showmeans=True, widths=(0.4), sym="")
###
# Titles/Lables
ax1.set_xlabel("Number of average kilometers between customer and seller")
fig.suptitle("Representation of clients based on their average distance to seller when order was placed\
\nzoom on less than 2000, fliers removed")
ax1.set_xlim(left=(-100), right=(2000))
ax1.set_xticks(range(0, 2000, 200))
ax1.set_yticks([])
#
###
fig.tight_layout()
plt.show()
Filling NA by the median didn't impact negatively our data. Let's set our range to : under 50 kms, above 1500 kms with step 100kms.
# Sum num_orders per 100 km distance bucket; each key is the bucket's upper
# edge (50, 150, ..., 1450) and the last key (1550) is the open-ended tail.
distance_range = np.arange(50, 1551, 100)
distance_dict = dict.fromkeys(distance_range)
previous = 0
for distance in distance_range:
    if distance != distance_range[-1]:
        # Half-open bin [previous, distance): '>=' keeps rows sitting exactly
        # on a bin edge (the old strict '>' dropped them, including the
        # distance == 0.0 rows of the very first bucket).
        amnt_orders = df_cx[(df_cx["distance_cx_seller"] >= previous)
                            & (df_cx["distance_cx_seller"] < distance)]["num_orders"].sum()
    else:
        # Tail bucket: everything from the previous edge (1450 km) upward.
        # The old hard-coded '> 1550' silently dropped the 1450-1550 range
        # from the chart entirely.
        amnt_orders = df_cx[df_cx["distance_cx_seller"] >= previous]["num_orders"].sum()
    distance_dict[distance] = amnt_orders
    previous = distance
def addlabels_ranged100(x, y):
    """Write each bar's value above it, for bars centred every 100 km starting at 50.

    Like addlabels, x is only used for its length, so enumerate(y) replaces
    the range(len(x)) indexing.
    """
    for i, value in enumerate(y):
        plt.text(i * 100 + 50, value, f"{value}", ha="center", bbox=dict(facecolor="white", alpha=1))
# Bar chart of order counts per distance bucket, colored by a reversed magma
# colormap scaled to the tallest bar
fig, (ax1) = plt.subplots(
ncols=1,
nrows=1,
figsize=(8, 6),
dpi=pc_dpi,
)
magma = plt.get_cmap("magma").reversed()
height_normalized = [value / max(list(distance_dict.values())) for value in list(distance_dict.values())]
colors = magma(height_normalized)
ax1.bar(x=list(distance_dict.keys()), height=list(distance_dict.values()), width=90, color=colors)
###
# Titles/Lables
ax1.set_xticks(distance_range)
ax1.set_xlabel("Average distance from seller")
addlabels_ranged100(x=[distance for distance in distance_dict.keys()], y=[amnt for amnt in distance_dict.values()])
#
###
fig.tight_layout()
plt.show()
Note : As we drew our limits between 10% and 90% of the dataset, the high amount of order above 1550 kms is to be expected as it represents 10% of the dataset
This graph shows that, indeed, people orders are more often than not to seller that are relatively close, orders from seller above 550 kms away begin to drop quite quickly. We can hypothesize that expected delivery time is shorter for shorter distances and clients prefer to get an item quickly. Although there is a non negligible part of orders ranging from 650km and above, it is possible that it is for specific products that have no alternatives locally or from customers who live in more remotes part of Brazil.
This is a more interesting variable than expected. As we know, alternatives like Amazon for example use a subscription method like Prime to get rid of delivery fees altogether. It hints that it is not the case here or there is an incentive (fee, time etc.)
PCA will work better on certain data, while I have coded a layer of abstraction above it to make it easier to use and streamline the process, we still need to select our variables accordingly.
By default, converting a boolean column into a int column will do the 1/0 conversion for us, not necessary to reinvent the wheel there
We will begin by using IQR method to remove outliers from the dataset.
def delta_creation(row, now=None):
    """
    Return the number of whole days between ``now`` and the most ancient order.

    The most ancient order is understood as the account creation date for lack
    of more specific info.

    Parameters
    ----------
    row : pd.Series
        Row holding a datetime-like "most_ancient_order_dt" value.
    now : np.datetime64, optional
        Reference instant; defaults to the current time. Exposed as a
        parameter so every row of a frame can be dated against one single
        instant and so the function is deterministic in tests. Existing
        callers (``df.apply(delta_creation, axis=1)``) are unaffected.

    Returns
    -------
    int
        Whole days elapsed, truncated to day resolution.
    """
    if now is None:
        now = np.datetime64("now")
    then = row["most_ancient_order_dt"]
    delta = now - then
    # Truncate to day resolution before casting to a plain int.
    delta = np.timedelta64(delta, "D")
    return delta.astype(int)
Let's first convert "most_ancient_order_dt", which we assimilate to the account creation. We want the number of days between then and now\ We'll keep the col for the stability analysis.
# "most_ancient_order_dt" (assimilated to account creation) -> days elapsed.
# The datetime column itself is kept for the stability analysis.
df_cx["delta_creation"] = df_cx.apply(delta_creation, axis=1)
# Boolean flags -> 0/1 ints; astype does the conversion natively.
for flag_col in ("has_rated", "has_commented", "early_on_time"):
    df_cx[flag_col] = df_cx[flag_col].astype(int)
df_cx.head()
| customer_uid | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | cluster_kmeans_4 | cluster_DBSCAN | k_cluster_name | ... | rating_ratio | comment_ratio | has_rated | has_commented | delta_delivery | expected_reality | early_on_time | distance_cx_seller | delta_days | delta_creation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 | 3 | 0 | early_majority | ... | 1.0 | 0.0 | 1 | 0 | 8 days 19:30:00 | -11 days | 1 | 346.9776 | -11.0 | 1969 |
| 1 | 2 | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 | 3 | 0 | early_majority | ... | 1.0 | 0.0 | 1 | 0 | 16 days 15:52:55 | -8 days | 1 | 413.9531 | -8.0 | 1728 |
| 2 | 3 | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 | 3 | 0 | early_majority | ... | 1.0 | 0.0 | 1 | 0 | 26 days 01:51:06 | 1 days | 0 | 29.5741 | 1.0 | 1601 |
| 3 | 4 | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.30 | 1 | 1 | 0 | investors | ... | 1.0 | 0.0 | 1 | 0 | 14 days 23:57:47 | -13 days | 1 | 19.3538 | -13.0 | 1668 |
| 4 | 5 | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 | 0 | 0 | late_majority | ... | 1.0 | 1.0 | 1 | 1 | 11 days 11:04:18 | -6 days | 1 | 219.7266 | -6.0 | 1531 |
5 rows × 24 columns
## IQR
# Row count before the IQR-based outlier removal, for comparison with "After".
print("Before", len(df_cx))
Before 95922
# Columns cleaned with the IQR rule.
target_iqr = [
    "num_orders", "rating_avg", "distance_cx_seller",
    "delta_days", "delta_creation"
]
# Iterate the target list directly instead of scanning every frame column;
# the membership guard keeps it safe if a target column is ever missing.
# remove_outliers mutates df in place (the length check below reflects it).
for column in target_iqr:
    if column in df_cx.columns:
        remove_outliers(column_eval=column, df=df_cx)
print("After", len(df_cx))
After 85566
# Columns that must not feed the PCA: identifiers, geography, raw datetimes,
# timedelta columns and the clusters computed in the RFM notebook.
dropcols = [
    "customer_uid", "order_id_list",                          # identifiers
    "lat", "lon",                                             # geography
    "most_ancient_order_dt", "most_recent_order_dt",          # raw datetimes
    "delta_delivery", "expected_reality",                     # timedeltas
    "cluster_kmeans_4", "cluster_DBSCAN", "k_cluster_name",   # RFM clusters
]
df_model = df_cx.drop(columns=dropcols, errors="ignore")
df_model.head()
| recency | frequency | monetary | num_orders | rating_avg | rating_ratio | comment_ratio | has_rated | has_commented | early_on_time | distance_cx_seller | delta_days | delta_creation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44850283.0 | 0.053939 | 146.87 | 1 | 4.0 | 1.0 | 0.0 | 1 | 0 | 1 | 346.9776 | -11.0 | 1969 |
| 1 | 24007314.0 | 0.100769 | 335.48 | 1 | 5.0 | 1.0 | 0.0 | 1 | 0 | 1 | 413.9531 | -8.0 | 1728 |
| 2 | 13051353.0 | 0.185360 | 157.73 | 1 | 5.0 | 1.0 | 0.0 | 1 | 0 | 0 | 29.5741 | 1.0 | 1601 |
| 3 | 18840220.0 | 0.128406 | 173.30 | 1 | 5.0 | 1.0 | 0.0 | 1 | 0 | 1 | 19.3538 | -13.0 | 1668 |
| 4 | 6939528.0 | 0.348612 | 252.25 | 1 | 5.0 | 1.0 | 1.0 | 1 | 1 | 1 | 219.7266 | -6.0 | 1531 |
For more info about the class Easy_pca, documentation is provided in the model as docstring
# Easy_pca wraps the scaling + PCA pipeline (see the class docstring).
epca_model = Easy_pca(dataset=df_model)
fig = epca_model.get_scree_plot(show=False)
# Resize to the notebook-wide DPI and an 8x5-inch canvas before rendering.
fig.set_dpi(pc_dpi)
fig.set_size_inches(8, 5)
fig.tight_layout()
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/124908471.py:8: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
Looks like we get around 90% cumulative inertia with PC1 through PC5. Let's take a closer look.
# Correlation circle for the first factorial plane (PC1 x PC2, 0-indexed).
fig = epca_model.display_circles(couple_pc=(0, 1), show=False)
fig.set_dpi(pc_dpi)
fig.set_size_inches(5, 5)
fig.tight_layout()
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/2243853122.py:8: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
# Correlation circle for PC3 x PC4 (0-indexed (2, 3)).
fig = epca_model.display_circles(couple_pc=(2, 3), show=False)
fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(5)
fig.tight_layout()  # added for consistency with the (0, 1) circle plot
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/1128338058.py:7: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
# Correlation circle for PC4 x PC5 (0-indexed (3, 4)).
fig = epca_model.display_circles(couple_pc=(3, 4), show=False)
fig.set_dpi(pc_dpi)
fig.set_figheight(5)
fig.set_figwidth(5)
fig.tight_layout()  # added for consistency with the (0, 1) circle plot
fig.show()
/var/folders/3s/s8sp6jwn6qs02jfxbgjc7c_40000gn/T/ipykernel_2482/102305747.py:7: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
# Biplots (individuals + variable vectors) for the three retained planes.
epca_model.biplot((0, 1))
epca_model.biplot((2, 3))
epca_model.biplot((3, 4))
# Variable loadings for the first five PCs; the last row holds the inertia %.
df_contribution = epca_model.show_contribution(lim_pc=5)
df_contribution
| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| recency | -5.839749e-01 | 0.061885 | -1.397124e-02 | -4.374793e-02 | -3.328919e-02 |
| frequency | 5.407418e-01 | -0.044440 | 1.523987e-03 | 6.366289e-02 | 1.824991e-02 |
| monetary | -6.013037e-03 | -0.040135 | 1.608210e-02 | 6.883383e-02 | 6.452213e-01 |
| num_orders | 1.387779e-17 | -0.000000 | 2.775558e-17 | 1.110223e-16 | 5.551115e-17 |
| rating_avg | 5.341263e-02 | 0.377926 | -1.860646e-01 | 2.836695e-01 | -1.405585e-01 |
| rating_ratio | 1.334033e-02 | -0.173454 | -6.434531e-01 | -2.335224e-01 | 2.432052e-02 |
| comment_ratio | -7.940703e-02 | -0.566676 | 9.576399e-03 | 3.862057e-01 | -9.372702e-02 |
| has_rated | 1.334033e-02 | -0.173454 | -6.434531e-01 | -2.335224e-01 | 2.432052e-02 |
| has_commented | -7.940703e-02 | -0.566676 | 9.576399e-03 | 3.862057e-01 | -9.372702e-02 |
| early_on_time | 2.186455e-02 | 0.304277 | -2.883941e-01 | 4.878433e-01 | -9.015474e-02 |
| distance_cx_seller | -5.782445e-02 | -0.018728 | -3.576837e-03 | 8.081852e-02 | 7.211011e-01 |
| delta_days | 7.658825e-02 | -0.224894 | 2.308622e-01 | -5.049915e-01 | -1.204827e-01 |
| delta_creation | -5.839762e-01 | 0.061881 | -1.396981e-02 | -4.373149e-02 | -3.329034e-02 |
| Inertia | 2.320000e+01 | 19.100000 | 1.700000e+01 | 1.270000e+01 | 9.100000e+00 |
We have there the coefficients of each variable, to get a better grasp on each one and to see what subset of variables contribute the most, we will use abs values for each PC, and rename it as we go.
# Rank variables by |loading| on PC1 (Series.abs() == abs(Series)).
df_contribution["PC1"].abs().sort_values(ascending=False)
Inertia 2.320000e+01 delta_creation 5.839762e-01 recency 5.839749e-01 frequency 5.407418e-01 comment_ratio 7.940703e-02 has_commented 7.940703e-02 delta_days 7.658825e-02 distance_cx_seller 5.782445e-02 rating_avg 5.341263e-02 early_on_time 2.186455e-02 has_rated 1.334033e-02 rating_ratio 1.334033e-02 monetary 6.013037e-03 num_orders 1.387779e-17 Name: PC1, dtype: float64
As we can see, PC1 represents Time. The frequency is positively correlated with PC1 while delta_creation and the recency are negatively correlated. It is basically the time since last order.
# Rank variables by |loading| on PC2.
df_contribution["PC2"].abs().sort_values(ascending=False)
Inertia 19.100000 comment_ratio 0.566676 has_commented 0.566676 rating_avg 0.377926 early_on_time 0.304277 delta_days 0.224894 has_rated 0.173454 rating_ratio 0.173454 recency 0.061885 delta_creation 0.061881 frequency 0.044440 monetary 0.040135 distance_cx_seller 0.018728 num_orders 0.000000 Name: PC2, dtype: float64
While there is more of a mix in PC2, we can clearly see that it represents the customer's involvement with the order : the variables on ratings and comments are both very important in this PC. While we see delta_days scoring quite high, we already established it was highly correlated with the ratings and user's involvement.
# Rank variables by |loading| on PC3.
df_contribution["PC3"].abs().sort_values(ascending=False)
Inertia 1.700000e+01 has_rated 6.434531e-01 rating_ratio 6.434531e-01 early_on_time 2.883941e-01 delta_days 2.308622e-01 rating_avg 1.860646e-01 monetary 1.608210e-02 recency 1.397124e-02 delta_creation 1.396981e-02 comment_ratio 9.576399e-03 has_commented 9.576399e-03 distance_cx_seller 3.576837e-03 frequency 1.523987e-03 num_orders 2.775558e-17 Name: PC3, dtype: float64
While PC2 and PC3 might seem similar, PC3 seems entirely focused on ratings and seems to ignore comments. early_on_time and delta_days score also high, likely for the same reason as PC2. Taking the real values (not abs), we can see that rating avg. and comment ratio are anti-correlated to the variable "delta_days", it is interpretable as general dissatisfaction : when the value of delta days climbs, the ratings fall, as we have established in 2.1
# Rank variables by |loading| on PC4.
df_contribution["PC4"].abs().sort_values(ascending=False)
Inertia 1.270000e+01 delta_days 5.049915e-01 early_on_time 4.878433e-01 comment_ratio 3.862057e-01 has_commented 3.862057e-01 rating_avg 2.836695e-01 has_rated 2.335224e-01 rating_ratio 2.335224e-01 distance_cx_seller 8.081852e-02 monetary 6.883383e-02 frequency 6.366289e-02 recency 4.374793e-02 delta_creation 4.373149e-02 num_orders 1.110223e-16 Name: PC4, dtype: float64
The absolute values of PC3 and PC4 are very similar, for a very good reason : while PC3 represents discontentment, PC4 represents satisfaction : delta days is highly negatively correlated with PC4, while rating average and early/on time are highly positively correlated with PC4. If delta days is low, and the order is on time or early, cx is happy.
# Rank variables by |loading| on PC5.
df_contribution["PC5"].abs().sort_values(ascending=False)
Inertia 9.100000e+00 distance_cx_seller 7.211011e-01 monetary 6.452213e-01 rating_avg 1.405585e-01 delta_days 1.204827e-01 comment_ratio 9.372702e-02 has_commented 9.372702e-02 early_on_time 9.015474e-02 delta_creation 3.329034e-02 recency 3.328919e-02 has_rated 2.432052e-02 rating_ratio 2.432052e-02 frequency 1.824991e-02 num_orders 5.551115e-17 Name: PC5, dtype: float64
This PC is highly correlated with monetary and distance_cx_seller (both load strongly positive in the contribution table above, while num_orders is essentially zero) : we established in 3.1 that order amounts decreased with the distance, which this component captures. We can think of PC5 as a variable that describes the volume of transaction and the overall sum of money people spend on Olist.
# Keep the five components carrying ~90% of the cumulative inertia.
pc_keep = [f"PC{i}" for i in range(1, 6)]
df_model_pca = epca_model.pcframe[pc_keep]
df_model_pca.head()
| PC1 | PC2 | PC3 | PC4 | PC5 | |
|---|---|---|---|---|---|
| 0 | -2.057101 | 1.183986 | -0.225623 | -0.804880 | -0.239802 |
| 1 | 0.071706 | 1.138780 | -0.229457 | -0.521352 | 0.420751 |
| 2 | 1.541139 | -0.235138 | 1.024167 | -2.749404 | -0.598572 |
| 3 | 0.694076 | 1.254976 | -0.355856 | -0.325240 | -0.736289 |
| 4 | 2.579234 | -1.459571 | -0.103386 | 1.132912 | -0.487445 |
# Human-readable names for the retained components, derived from the
# loadings discussed above. NOTE: "general_involvment" (sic) is reused as a
# column name further down, so its spelling must not change.
rename_dict = {
    "PC1": "time_impact",
    "PC2": "general_involvment",
    "PC3": "overall_discontentment",
    "PC4": "overall_satisfaction",
    "PC5": "value_and_volume",
}
df_model_pca = df_model_pca.rename(columns=rename_dict)
df_model_pca.head()
| time_impact | general_involvment | overall_discontentment | overall_satisfaction | value_and_volume | |
|---|---|---|---|---|---|
| 0 | -2.057101 | 1.183986 | -0.225623 | -0.804880 | -0.239802 |
| 1 | 0.071706 | 1.138780 | -0.229457 | -0.521352 | 0.420751 |
| 2 | 1.541139 | -0.235138 | 1.024167 | -2.749404 | -0.598572 |
| 3 | 0.694076 | 1.254976 | -0.355856 | -0.325240 | -0.736289 |
| 4 | 2.579234 | -1.459571 | -0.103386 | 1.132912 | -0.487445 |
Moving forward we will use this dataset (same index) for clustering.
We will export the final dataset and move on to another document solely focused on maintenance of the chosen algorithm.
Why ? : This notebook is quite heavy a resource consuming. We will take the maintenance as a separate document to avoid :
1 : Overusing the kernel, which is already pretty full and will execute slowly on different setups (native jupyter, jupyter-lab, vscode etc.)
# Quick re-check of the renamed PCA frame before clustering.
df_model_pca.head()
| time_impact | general_involvment | overall_discontentment | overall_satisfaction | value_and_volume | |
|---|---|---|---|---|---|
| 0 | -2.057101 | 1.183986 | -0.225623 | -0.804880 | -0.239802 |
| 1 | 0.071706 | 1.138780 | -0.229457 | -0.521352 | 0.420751 |
| 2 | 1.541139 | -0.235138 | 1.024167 | -2.749404 | -0.598572 |
| 3 | 0.694076 | 1.254976 | -0.355856 | -0.325240 | -0.736289 |
| 4 | 2.579234 | -1.459571 | -0.103386 | 1.132912 | -0.487445 |
# Scan k in [2, 10): the helper reports SSE and silhouette per candidate k.
k_range = range(2, 10)
k_means_optimizer(data=df_model_pca, k_range=k_range)
With this approach, 5 | 6 Clusters seems to be the way to go, it maximizes the Silhouette Score while having an acceptable SSE Trying 6 first as it has the best Sil Score, otherwise, 5 "might" be a bit more explainable, we'll see in the groups population, we will iterate 5-6 if there's a cluster with less than 1000 customers or so.
-- > 5 is more global, 6 does not seem to cluster correctly
# k=5 chosen from the optimizer output (best SSE / silhouette trade-off).
# random_state pins the centroid seeding so cluster numbers survive re-runs
# (a later note in this notebook complains that re-execution reshuffles the
# cluster numbers — that is exactly what an unseeded KMeans does).
# n_init is pinned explicitly because its default changed across sklearn versions.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y_predicted = km.fit_predict(df_model_pca)
df_model_pca["cluster_km"] = y_predicted
df_cx["cluster_km"] = y_predicted  # Index was kept between the dimensional reductions.
We will also use DBSCAN with the same idea as RFM #4.2, with min points as dimension + 1 (4) Epsilon can be determined using k neighbors. Using a graph to represent the avg distance between a point and its k-neighbors (here 4 : dimension + 1). Zooming in and using the elbow method help us to focus on the best potential epsilon.
# k-distance data used to pick DBSCAN's epsilon: fit a kNN model and sort
# per-point neighbour distances.
# NOTE(review): the inline comment says "dataset dim + 1", but the PCA frame
# has 5 feature dims, which would give 6, not 4 — confirm the intended k.
neighbors_matrix = df_model_pca.drop(columns=["cluster_km"]).to_numpy()
nneighbors = NearestNeighbors(n_neighbors=4, n_jobs=-1) # dataset dim + 1
nneighbors.fit(X=neighbors_matrix)
distances, potential_eps = nneighbors.kneighbors(neighbors_matrix)
distances = np.sort(distances, axis=0)
# Column 0 is each point's distance to itself (always 0); column 1 is the
# first real neighbour. NOTE(review): k-distance plots conventionally use the
# farthest neighbour (distances[:, -1]) — confirm column 1 is intended.
distances_plot = distances[:,1]
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(16, 8),
    dpi=pc_dpi,
)
# Sorted k-distance curve: the elbow suggests a DBSCAN epsilon.
ax1.plot(distances_plot)
###
# Titles/Labels
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
# NOTE(review): the title says "Neighbors = 6" but NearestNeighbors above was
# fit with n_neighbors=4 — confirm which value is intended.
fig.suptitle("Points sorted by distance - Neighbors = 6")
#
###
fig.tight_layout()
plt.show()
# Lets innovate a bit and use plotly to zoom in
# Interactive version of the k-distance curve, to read the elbow precisely.
fig = px.line(distances_plot)
fig.show()
Around eps = 1.1 | 1.2 seems to be the tipping point
# Epsilon read from the elbow of the k-distance curve above.
dbs = DBSCAN(eps=1.15, min_samples=6, n_jobs=-1)
# Bug fix: the original dropped "cluster_k4", a column that does not exist
# (errors="ignore" silently hid the typo), so the k-means labels
# ("cluster_km") leaked into the DBSCAN feature space. Drop the real label
# columns before fitting.
y_predict_dbs = dbs.fit_predict(df_model_pca.drop(columns=["cluster_km", "cluster_DBSCAN"], errors="ignore"))
df_model_pca["cluster_DBSCAN"] = y_predict_dbs
df_model_pca["cluster_DBSCAN"].unique()
array([ 0, 1, 2, 3, 4, 5, -1, 7, 6])
There are a lot more clusters than with K-means.
Since the dataset seems linearly separable, DBSCAN doesn't do a great job at clustering.
While it is very good at dimensional reduction, it makes the axes hard to interpret. It helps visualizing but the more robust explanation will be through radars of the average and median of each group.
We will use both radar charts to plot the average and/or median client/cluster and UMAP to reduce the dimensions from 6 to 3, making it possible to visualize.
# The PCA frame now carries both partitions (k-means and DBSCAN labels).
df_model_pca.head()
| time_impact | general_involvment | overall_discontentment | overall_satisfaction | value_and_volume | cluster_km | cluster_DBSCAN | |
|---|---|---|---|---|---|---|---|
| 0 | -2.057101 | 1.183986 | -0.225623 | -0.804880 | -0.239802 | 4 | 0 |
| 1 | 0.071706 | 1.138780 | -0.229457 | -0.521352 | 0.420751 | 4 | 0 |
| 2 | 1.541139 | -0.235138 | 1.024167 | -2.749404 | -0.598572 | 3 | 0 |
| 3 | 0.694076 | 1.254976 | -0.355856 | -0.325240 | -0.736289 | 0 | 1 |
| 4 | 2.579234 | -1.459571 | -0.103386 | 1.132912 | -0.487445 | 0 | 2 |
# DBSCAN labels are excluded from the radar stats; only k-means is plotted.
df_model_pca_no_dbs = df_model_pca.drop(columns=["cluster_DBSCAN"], errors="ignore")
# One wrapper per k-means cluster (0..4).
k_zero, k_one, k_two, k_three, k_four = (
    Cx_groups_post_pca(dataframe=df_model_pca_no_dbs, k_group=g, cluster_col="cluster_km")
    for g in range(5)
)
# store_radar() returns the radar data and caches the polar trace on the object.
radar_zero = k_zero.store_radar()
radar_one = k_one.store_radar()
radar_two = k_two.store_radar()
radar_three = k_three.store_radar()
radar_four = k_four.store_radar()
# Let's superpose the radars : (Class Cx Group saves the trace)
fig = go.Figure()
fig.add_trace(go.Scatterpolar(k_zero.trace))
fig.add_trace(go.Scatterpolar(k_one.trace))
fig.add_trace(go.Scatterpolar(k_two.trace))
fig.add_trace(go.Scatterpolar(k_three.trace))
fig.add_trace(go.Scatterpolar(k_four.trace))
title = "Stats of PCA vars (min maxed), clusters determined by k-means w/ k=4 on pca dataset"
fig.update_layout(
title=title,
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 10]
)),
showlegend=True
)
fig.show()
# Cluster sizes: used below to spot the tiny "atypical" cluster.
df_model_pca["cluster_km"].value_counts()
4 28430 1 26172 0 23287 3 7046 2 631 Name: cluster_km, dtype: int64
There's a cluster with around 650 clients, we can safely classify them in the atypical data category. Likely the result of an input error or an atypical behavior. Those are clients that we cannot interpret.
We can see at first glance than a cluster containing around 7000 clients displays an average discontent higher than the other clusters. It can be seen as clients who are not satisfied with Olist/had a bad experience on the site.
A group that is less distinguishable with these variables, but still very important, is the cluster that shares a lot of common stats with the two other majorities, except time_impact, which is much higher on average than in the other clusters. It points to very recent clients, as time_impact (PC1) is negatively correlated with the variables used to determine the age of the account (namely the recency, and delta_creation, which is the agreed-upon data pointing towards the first order of the account); higher values in these variables indicate that the account is recent and/or used recently. This is a group whose loyalty we must build.
We also have two majorities sharing the same stats. in avg., maj1 is remarkable as it has less involvement and less satisfaction than maj2.
Note : Cluster names cannot be given here as the re exec of KMeans will not always result the same cluster_number, so rerunning the script will give different numbers to clusters.
We can use UMAP to try and visualize them.
# Project the 5 named PCA axes down to 3-D for visualization, keeping the
# k-means label aside for coloring.
reducer = UMAP(n_components=3, n_jobs=-1)
pca_axes = [
    "time_impact", "general_involvment", "overall_discontentment",
    "overall_satisfaction", "value_and_volume",
]
df_umap = df_model_pca[pca_axes + ["cluster_km"]].copy()
embedding = reducer.fit_transform(df_umap.drop(columns=["cluster_km"]))
df_reduced = pd.DataFrame(embedding)
df_reduced["cluster_km"] = df_umap["cluster_km"].astype(str)  # Index kept, convert to str for plotly
# Marker styling shared by every trace of the 3-D scatter.
marker_style = {
    "size": 5,
}
# 3-D scatter of the UMAP embedding, colored by k-means cluster label.
fig = px.scatter_3d(
    data_frame=df_reduced,
    x=0, y=1, z=2,
    color="cluster_km",
    width=4 * pc_dpi,
    height=3 * pc_dpi,
)
fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="Visualisation of dataset using UMAP reducer",
)
fig.update_traces(marker=marker_style)
fig.show()
Even though UMAP abstracted the variables, we can see that, indeed, the cluster we associated with unsatisfied customers is clearly visible, as are the atypicals. The two majorities are well defined, and we can see that the group we associated with new customers is a sort of extension of the two large majorities.
We will study in details those groups before studying the maintenance suggestions.
We can go back to the original dataset and remove the clusters defined in RFM.
# Discard the RFM-era cluster columns; only the fresh "cluster_km" remains.
df_cx = df_cx.drop(columns=["cluster_kmeans_4", "cluster_DBSCAN", "k_cluster_name"])
df_cx.columns
Index(['customer_uid', 'most_ancient_order_dt', 'most_recent_order_dt',
'recency', 'frequency', 'monetary', 'num_orders', 'lat', 'lon',
'order_id_list', 'rating_avg', 'rating_ratio', 'comment_ratio',
'has_rated', 'has_commented', 'delta_delivery', 'expected_reality',
'early_on_time', 'distance_cx_seller', 'delta_days', 'delta_creation',
'cluster_km'],
dtype='object')
# df_cx now carries only the new k-means labels.
df_cx.head()
| customer_uid | most_ancient_order_dt | most_recent_order_dt | recency | frequency | monetary | num_orders | lat | lon | order_id_list | ... | comment_ratio | has_rated | has_commented | delta_delivery | expected_reality | early_on_time | distance_cx_seller | delta_days | delta_creation | cluster_km | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017-05-16 15:05:35 | 2017-05-16 15:05:35 | 44850283.0 | 0.053939 | 146.87 | 1 | -20.509897 | -47.397866 | [88493] | ... | 0.0 | 1 | 0 | 8 days 19:30:00 | -11 days | 1 | 346.9776 | -11.0 | 1969 | 4 |
| 1 | 2 | 2018-01-12 20:48:24 | 2018-01-12 20:48:24 | 24007314.0 | 0.100769 | 335.48 | 1 | -23.726853 | -46.545746 | [90419] | ... | 0.0 | 1 | 0 | 16 days 15:52:55 | -8 days | 1 | 413.9531 | -8.0 | 1728 | 4 |
| 2 | 3 | 2018-05-19 16:07:45 | 2018-05-19 16:07:45 | 13051353.0 | 0.185360 | 157.73 | 1 | -23.527788 | -46.660310 | [22558] | ... | 0.0 | 1 | 0 | 26 days 01:51:06 | 1 days | 0 | 29.5741 | 1.0 | 1601 | 3 |
| 3 | 4 | 2018-03-13 16:06:38 | 2018-03-13 16:06:38 | 18840220.0 | 0.128406 | 173.30 | 1 | -23.496930 | -46.185352 | [32181] | ... | 0.0 | 1 | 0 | 14 days 23:57:47 | -13 days | 1 | 19.3538 | -13.0 | 1668 | 0 |
| 4 | 5 | 2018-07-29 09:51:30 | 2018-07-29 09:51:30 | 6939528.0 | 0.348612 | 252.25 | 1 | -22.987222 | -47.151073 | [69903] | ... | 1.0 | 1 | 1 | 11 days 11:04:18 | -6 days | 1 | 219.7266 | -6.0 | 1531 | 0 |
5 rows × 22 columns
df_cx.columns
# Re-attach the creation date (indexes are aligned across frames) and export
# the label-free PCA frame for the separate maintenance notebook.
df_model_pca_no_dbs["most_ancient_order_dt"] = df_cx["most_ancient_order_dt"] # Indexes are kept
df_model_pca_no_dbs.drop(columns=["cluster_km"]).to_pickle(path="../pickles/dataset_pca_maintenance.pkl") # Export for maintenance
To name the clusters we can go with
--
We cannot predict which x number will be y group name so we will use the input method and use the radars to reference the correct groups
## Important : Maj 1 is the less satisfied majority, it will be important later !
def _ask_cluster(prompt):
    """Prompt until the user types a valid integer cluster number."""
    # Robustness fix: int(input()) crashed the cell on any typo; re-prompt instead.
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print(f"'{raw}' is not an integer, try again.")

# Cluster numbers change across KMeans re-runs, so we map them by hand,
# using the radar chart to identify each group.
atypc_cx = _ask_cluster("Atypical : ")
new_cx = _ask_cluster("New customers : ")
maj1 = _ask_cluster("Maj1 : ")
maj2 = _ask_cluster("Maj2 : ")
unsat_cx = _ask_cluster("Unsatisfied : ")
rename_clusters = {
    atypc_cx: "atypical", new_cx: "new_client",
    maj1: "majority_1", maj2: "majority_2",
    unsat_cx: "unsatisfied"
}
Foreword : We will drop the cluster containing atypical/outliers customers, a cluster size of less than 1% of the dataset will either be difficult to interpret or too time and resource consuming and not cost effective at a scale that small.
def name_clusters(row, replacement_dict):
    """Map a row's numeric "cluster_km" label to its human-readable name.

    Raises KeyError if the cluster number is absent from replacement_dict.
    """
    return replacement_dict[int(row["cluster_km"])]
# args is any iterable of extra positional arguments for name_clusters.
df_cx["kmeans_name"] = df_cx.apply(func=name_clusters, args=(rename_clusters,), axis=1)
# Pre drop
len(df_cx)  # row count before discarding the atypical cluster
85566
# Removing atypical group
df_cx = df_cx.loc[df_cx["kmeans_name"] != "atypical"]
# Post drop
len(df_cx)
84935
Focusing on the big picture first, we have two majorities which seem to share a lot, but UMAP showed that several variables were significatively different. On the radar chart we can hypothesize that, because it is the involvement component that mainly differs them, we need to look at comments and ratings ratios. The ratings/satisfaction seem, although one cluster shows a minor decrease in satisfaction, the most important differences.
As we saw in early visualizations, both ratios are really close to 1 or 0 (most customers only ordered once). A plot will not easily capture the difference, we might as well use the average per cluster
## Involvment :
# Average comment ratio per majority cluster — the ratios sit near 0 or 1,
# so a plot would not separate the groups well.
m1_mask = df_cx["kmeans_name"] == "majority_1"
m2_mask = df_cx["kmeans_name"] == "majority_2"
maj1_avg_comment_ratio = np.average(df_cx.loc[m1_mask, "comment_ratio"].values)
maj2_avg_comment_ratio = np.average(df_cx.loc[m2_mask, "comment_ratio"].values)
print(f"maj1 average comment ratio = {maj1_avg_comment_ratio}, maj2 average comment ratio = {maj2_avg_comment_ratio}")
maj1 average comment ratio = 1.0, maj2 average comment ratio = 0.0
# Average rating ratio per majority cluster.
m1_mask = df_cx["kmeans_name"] == "majority_1"
m2_mask = df_cx["kmeans_name"] == "majority_2"
maj1_avg_rating_ratio = np.average(df_cx.loc[m1_mask, "rating_ratio"].values)
maj2_avg_rating_ratio = np.average(df_cx.loc[m2_mask, "rating_ratio"].values)
print(f"maj1 average rating ratio = {maj1_avg_rating_ratio}, maj2 average rating ratio = {maj2_avg_rating_ratio}")
maj1 average rating ratio = 1.0, maj2 average rating ratio = 1.0
While the average rating ratio is as expected, it seems the major factor separating the two clusters is the fact that majority 2 has not left a comment. It explains why there was a drop in the PC summarizing the general involvement.
Let's take a look at the ratings, maybe there are more differences. If the ratings are significantly different, we will use two boxplots to get a better overall view.
# Average rating per majority cluster.
maj1_avg_ratings = np.average(df_cx[df_cx["kmeans_name"] == "majority_1"]["rating_avg"].values)
maj2_avg_ratings = np.average(df_cx[df_cx["kmeans_name"] == "majority_2"]["rating_avg"].values)
# Bug fix: the message said "rating ratio" while the values are rating averages.
print(f"maj1 average rating = {maj1_avg_ratings}, maj2 average rating = {maj2_avg_ratings}")
maj1 average rating ratio = 3.9249579703499924, maj2 average rating ratio = 4.473619416109743
It's significant enough, let's take a broader look
# Stacked boxplots of average ratings for the two majorities.
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(8, 5),
    dpi=pc_dpi,
)
box_kwargs = dict(vert=False, showmeans=True, widths=0.6)
ax1.boxplot(x=df_cx.loc[df_cx["kmeans_name"] == "majority_1", "rating_avg"].values, **box_kwargs)
ax2.boxplot(x=df_cx.loc[df_cx["kmeans_name"] == "majority_2", "rating_avg"].values, **box_kwargs)
# Hide the meaningless y tick labels and title each panel.
ax1.set(yticklabels=[], title="Repartition of ratings, first majority")
ax2.set(yticklabels=[], title="Repartition of ratings, second majority")
fig.tight_layout()
plt.show()
While both ratings are high, the first majority seems to be less enthusiastic than the second majority. Not enough to be an issue, as ratings are well above the 2.5 mark, but it would be important to keep an eye on majority 1 to avoid a downward trend. Applying the same measures as we will recommend for the overall dissatisfied customers, although not at the same intensity, might be a way to prevent clients from majority 1 from becoming unsatisfied.
To conclude, we cannot merge the majorities, and we also need to keep an eye on majority 1, keeping them happy and making them happier as, by their population, they represent a sizable portion of the dataset. (around 25-30%) -- Renaming customers part of maj1 to can_shift. They are not unhappy yet, but could become so.
# Fix: reassignment instead of Series.replace(inplace=True) on a column
# selection — that pattern relies on chained assignment, which pandas is
# deprecating and which can silently fail to write back.
df_cx["kmeans_name"] = df_cx["kmeans_name"].replace(to_replace={"majority_1": "can_shift"})
It is necessary to have a preliminary diagnostic to :
We will take a look at average ratings, delays and distances to try and find a pattern if it exists.
# Rating repartition :
# Rating repartition : unsatisfied cluster vs whole dataset.
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(8, 5),
    dpi=pc_dpi,
)
box_kwargs = dict(vert=False, showmeans=True, widths=0.6)
ax1.boxplot(x=df_cx.loc[df_cx["kmeans_name"] == "unsatisfied", "rating_avg"].values, **box_kwargs)
ax2.boxplot(x=df_cx["rating_avg"].values, **box_kwargs)
# Hide the meaningless y tick labels and title each panel.
ax1.set(yticklabels=[], title="Repartition of ratings, unsatisfied customers")
ax2.set(yticklabels=[], title="Repartition of ratings, dataset wide")
fig.tight_layout()
plt.show()
No surprise here, it was expected from unsat. cluster to have ratings that are way lower in contrast to the rest of the dataset. It looks like the split we did (people that rated at most 2.5 in average, see #2).
We expect higher delta days between expected and actual delivery. We will take a look at distance from seller as well.
# Using the same code from #2 :
# Hoist the repeated sub-frame selections and compare on-time delivery rates
# overall vs the unsatisfied cluster.
unsat = df_cx[df_cx["kmeans_name"] == "unsatisfied"]
# early_on_time was cast to an int flag above; compare to 1 rather than
# `== True` (ruff/flake8 E712). Counts are identical to the original.
eot_mask = df_cx["early_on_time"] == 1
amount_eot_all = int(eot_mask.sum())
amount_eot_unsat = int((eot_mask & (df_cx["kmeans_name"] == "unsatisfied")).sum())
percentage_all = (amount_eot_all / len(df_cx)) * 100
percentage_unsat = (amount_eot_unsat / len(unsat)) * 100
print(f"There is {percentage_all}% of who have been delivered on time on the whole dataset")
print(f"There is {percentage_unsat}% of who have been delivered on time or early in the unsat group")
print(f"""
In total, {amount_eot_unsat} orders have been delivered on time or early in the unsat group,\
group size is {len(unsat)}
""")
There is 91.14852534290928% of who have been delivered on time on the whole dataset There is 0.0709622480840193% of who have been delivered on time or early in the unsat group In total, 5 orders have been delivered on time or early in the unsat group, group size is 7046
It is safe to say that the unsatisfied customers are the ones who have been impacted by delays the most. The recommendation for this group, considering the data we have, is to offer an incentive/a compensation for the delay issue, and to address it as soon as possible to avoid general dissatisfaction.
With these groups identified, it is possible to give a reliable explanation of which client is in which group. Maintenance will be done on the PCA dataset exported earlier to avoid differences. We can export the dataset containing client important info as an output, both pickle and csv will be used to maximize compatibility.
Recommendations :
# Persist the labelled customer dataset; pickle keeps dtypes intact, csv
# maximises compatibility for non-Python consumers.
df_cx.to_pickle(path="../pickles/final_clustering.pkl")
df_cx.to_csv(path_or_buf="../data/final/final_clustering.csv", index=False)